Homework 2

Author

Eva Wang

Task 1

This link – https://bioguide.congress.gov/search – contains information about every person who has ever served in the United States Congress. Your goal is to download whatever you want; you could only go with the most current Congress, choose a few years, or go all the way back in time and get everyone. The choice is yours, but you have to provide the reason for your choice. You don’t need to scrape anything; you can use the download button on the page! While you have some options, you must download the json files (no reason to take the easy way by getting a csv). The main information that you want out of each person’s page is their “profileText” and you need to choose at least one other piece of information to bring in with the profile – think about what might be useful. If you use R, you are going to want to make use of the list.files function (be sure to look at the arguments); Python users will likely use os.listdir. No matter which language you choose, you will probably need to use some type of error handling when reading the files.

library(rjson)
# Read the Bioguide search export. The code below relies on it parsing to a
# list with one element per member, each exposing an `id` field.
congress <- fromJSON(file = "~/Documents/Mod 3/Unstructured Data Analytics/Homework/Homework 2/BioguideProfiles/congress.json")

# Collect every member's Bioguide ID into a one-column data frame. These IDs
# are also the base names of the per-member JSON files read further down.
# seq_along() (rather than 1:length()) keeps an empty export from iterating
# over c(1, 0) and failing with "subscript out of bounds".
filtered_ID <- purrr::map_df(seq_along(congress), ~{
  data.frame(ID = congress[[.x]]$id)
})

# Folder holding one "<BioguideID>.json" file per member.
profile_dir <- "~/Documents/Mod 3/Unstructured Data Analytics/Homework/Homework 2/BioguideProfiles"

# Fields missing from the JSON come back as NULL / zero-length; map them to NA
# so every row keeps all seven columns.
value_or_na <- function(x) if (length(x) == 0L) NA_character_ else x

# Read one member's profile and flatten it to a one-row data frame.
# Returns NULL (with a warning) when the file is missing or cannot be parsed,
# so a single bad file does not abort the whole import.
read_member <- function(id) {
  tryCatch({
    member <- fromJSON(file = file.path(profile_dir, paste0(id, ".json")))

    # The last jobPositions entry is treated as the "latest" one.
    # NOTE(review): assumes positions are listed oldest-first - confirm.
    jobs <- member$jobPositions
    latest <- if (length(jobs) > 0L) jobs[[length(jobs)]] else NULL

    # Guard the [[1]] lookup: an empty partyAffiliation list would otherwise
    # raise "subscript out of bounds".
    parties <- latest$congressAffiliation$partyAffiliation
    party <- if (length(parties) > 0L) parties[[1]]$party$name else NULL

    data.frame(
      ID = value_or_na(member$usCongressBioId),
      FirstName = value_or_na(member$givenName),
      LastName = value_or_na(member$familyName),
      Latest_Position = value_or_na(latest$job$name),
      Latest_Party = value_or_na(party),
      Latest_State = value_or_na(latest$congressAffiliation$represents$regionCode),
      Profile = value_or_na(member$profileText)
    )
  }, error = function(e) {
    warning("Skipping profile ", id, ": ", conditionMessage(e), call. = FALSE)
    NULL
  })
}

# One row per member; NULL results from failed reads are dropped when the rows
# are bound together.
congress_df <- purrr::map_df(filtered_ID[[1]], read_member)

str(congress_df)
'data.frame':   241 obs. of  7 variables:
 $ ID             : chr  "A000376" "A000375" "A000148" "B001291" ...
 $ FirstName      : chr  "Colin" "Jodey" "Jake" "Brian" ...
 $ LastName       : chr  "Allred" "Arrington" "Auchincloss" "Babin" ...
 $ Latest_Position: chr  "Representative" "Representative" "Representative" "Representative" ...
 $ Latest_Party   : chr  "Democrat" "Republican" "Democrat" "Republican" ...
 $ Latest_State   : chr  "TX" "TX" "MA" "TX" ...
 $ Profile        : chr  "A Representative from Texas; born in Dallas, Dallas County, Tex., April 15, 1983; graduated from Hillcrest High"| __truncated__ "A Representative from Texas; born in Plainview, Tex., March 9, 1972; graduated from Plainview High School, Plai"| __truncated__ "a Representative from Massachusetts; born in Boston, Suffolk County, Mass., on January 29, 1988; graduated from"| __truncated__ "A Representative from Texas; born in Port Arthur, Jefferson County, Tex., March 23, 1948; graduated from Forest"| __truncated__ ...
# Preview the first rows of the parsed member table as a sanity check.
head(congress_df)
       ID FirstName    LastName Latest_Position Latest_Party Latest_State
1 A000376     Colin      Allred  Representative     Democrat           TX
2 A000375     Jodey   Arrington  Representative   Republican           TX
3 A000148      Jake Auchincloss  Representative     Democrat           MA
4 B001291     Brian       Babin  Representative   Republican           TX
5 B001307     James       Baird  Representative   Republican           IN
6 B001299     James       Banks  Representative   Republican           IN
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Profile
1                                                                                                                                                                                                                                                                                                                                                                               A Representative from Texas; born in Dallas, Dallas County, Tex., April 15, 1983; graduated from Hillcrest High School, Dallas, Tex., 2001; B.A., Baylor University, Waco, Tex., 2005; J.D., University of California, Berkeley, Calif., 2014; professional athlete; nonprofit executive; lawyer; staff, United States Department of Housing and Urban Development, 2016-2017; elected as a Democrat to the One Hundred Sixteenth and to the two succeeding Congresses (January 3, 2019-present).
2                                                                                   A Representative from Texas; born in Plainview, Tex., March 9, 1972; graduated from Plainview High School, Plainview, Hale County, Tex., 1990; B.A., Texas Tech University, Lubbock, Tex., 1994; M.A., Texas Tech University, Lubbock, Tex., 1997; special assistant to President George W. Bush, 2001; staff, chairman of the Federal Deposit Insurance Corporation, 2001-2005; Deputy Federal Coordinator and Chief Operating Officer, Office of the Federal Coordinator for Gulf Coast Rebuilding, 2005-2006; administrator, Texas Tech University, Lubbock, Tex., 2007-2014; business executive; elected as a Republican to the One Hundred Fifteenth and to the three succeeding Congresses (January 3, 2017-present); chair, Committee on the Budget (One Hundred Eighteenth Congress).
3                                                                                                                                                                                                                                                                                                                                                      a Representative from Massachusetts; born in Boston, Suffolk County, Mass., on January 29, 1988; graduated from Newton North High School, Newton, Mass., 2006; A.B., Harvard University, Cambridge, Mass., 2010; M.B.A., Massachusetts Institute of Technology, Cambridge, Mass., 2016; United States Marine Corps, 2010-2015; member of the Newton, Mass., city council, 2016-2020; business professional; elected as a Democrat to the One Hundred Seventeenth and to the succeeding Congress (January 3, 2021-present).
4 A Representative from Texas; born in Port Arthur, Jefferson County, Tex., March 23, 1948; graduated from Forest Park High School, Beaumont, Tex., 1966; B.S., Lamar University, Beaumont, Tex., 1973; D.D.S., University of Texas, Houston, Tex., 1976; United States Air Force, 1975-1979; Texas Army National Guard, 1969-1975; dentist; alderman, Woodville, Tex., 1981-1982, 1984-1989; Mayor of Woodville, Tex., 1982-1984; member, Texas Historical Commission, 1989-1995; member of the Woodville, Tex., school board, 1992-1995; unsuccessful candidate for election to the One Hundred Fifth Congress in 1996; unsuccessful candidate for election to the One Hundred Sixth Congress in 1998; member, Lower Neches Valley Authority, 1999-2014; elected as a Republican to the One Hundred Fourteenth and to the four succeeding Congresses (January 3, 2015-present).
5                                                                                                                                                                                                                                                                                      a Representative from Indiana; born in Fountain County, Ind., June 4, 1945; graduated from Turkey Run High School, Marshall, Ind., 1963; B.S., Purdue University, West Lafayette, Ind., 1967; M.S., Purdue University, West Lafayette, Ind., 1969; Ph.D., University of Kentucky, Lexington, Ky., 1975; United States Army, 1969-1972; farmer; small business owner; commissioner, Putnam County, Ind., 2006-2010; member of the Indiana general assembly, 2010-2018; elected as a Republican to the One Hundred Sixteenth and to the two succeeding Congresses (January 3, 2019-present).
6                                                                                                                                                                                          A Representative from Indiana; born in Columbia City, Whitley County, Ind., July 16, 1979; graduated from Columbia City High School, Columbia City, Ind., 1997; B.A., Indiana University, Bloomington, Ind., 2004; M.B.A., Grace College and Seminary, Winona Lake, Ind., 2013; United States Navy Reserve, 2012-present, Afghanistan War Veteran, 2014-2015; commercial real estate broker; chairman, Whitley County, Ind., Republican Party, 2007-2011; member of the Whitley County, Ind. Council, 2008-2010; member of the Indiana state senate, 2010-2016; elected as a Republican to the One Hundred Fifteenth and to the three succeeding Congresses (January 3, 2017-present).

I picked every Congress from the 114th to the present, which covers the time since I began living in the US in 2016. In addition, I filtered the list for Pennsylvania, Indiana, and Texas, which are the places where I have lived, live now, and will live. I further included Illinois, Massachusetts, and New York, which are places where I have thought about living and working. I also included Maine (too cold), New Mexico (salute to Mr. White, but pass), and Idaho (no thanks, university murder) as places I doubt I would ever live in.

Task 2

Now that you have your text data, you need to do some exploration, clean up the text, and then create a document-term matrix. You don’t need to do any analyses yet, but really think through your cleaning process. Every choice you make will have a consequence later. Save your tf-idf object.

library(dplyr)
library(tm)
library(quanteda)
library(stringr)

# Keep only the member ID and the biography text for the text-mining steps.
profile <- data.frame(ID = congress_df$ID,
                      Text = congress_df$Profile)

# Stop-word list used during cleaning. Both tm and quanteda export a
# `stopwords()` function with different signatures; the `source =` form is
# quanteda's, so it is called with an explicit namespace instead of relying on
# the order in which the packages were attached.
cleaning_stopwords <- c(quanteda::stopwords("en"),
                        quanteda::stopwords(source = "smart"),
                        "school", "university")

# Clean
# The order of the steps is deliberate:
#   1. split words that were glued together (lower-case letter directly
#      followed by an upper-case letter);
#   2. expand contractions;
#   3. strip state names/abbreviations while the text is still capitalised
#      (removeWords() matches case-sensitively and state.name / state.abb are
#      capitalised);
#   4. lower-case, drop the HTML-escaped ampersand, remove stop words;
#   5. turn every non-alphanumeric character into a space, delete digits;
#   6. lemmatise what is left.
profile$Text <- profile$Text %>%
  gsub("([a-z])([A-Z])", "\\1 \\2", .) %>%
  textclean::replace_contraction() %>%
  tm::removeWords(., words = c(state.name, state.abb)) %>%
  tolower() %>%
  str_replace_all(., "&amp;", "") %>%  # the weird &
  tm::removeWords(., words = cleaning_stopwords) %>%
  str_replace_all(., "[^[:alnum:]]", " ") %>% # replace non-alphanumeric characters with a space
  gsub('[[:digit:]]+', '', .) %>% # drop numbers
  textstem::lemmatize_strings(.)

# Corpus
# Build a corpus with one document per cleaned profile, then a document-term
# matrix weighted by normalised tf-idf.
profile_corpus <- Corpus(VectorSource(profile$Text))

# The former `stemming = function(x) lemmatize_strings(x)` entry was removed:
# `lemmatize_strings` is never attached in this file (textstem is only used via
# `::`), and the text has already been lemmatised during cleaning.
# NOTE(review): tm appears to ignore custom stemming functions for the
# SimpleCorpus that Corpus(VectorSource()) creates, so dropping the entry
# should not change the resulting matrix - confirm against the tm version used.
profile_dtm <- DocumentTermMatrix(profile_corpus,
                                  control = list(
                                    weighting = function(x) {
                                      weightTfIdf(x, normalize = TRUE)
                                    },
                                    tolower = TRUE,
                                    removePunctuation = TRUE,
                                    removeNumbers = TRUE,
                                    stopwords = TRUE
                                  ))

inspect(profile_dtm)
<<DocumentTermMatrix (documents: 241, terms: 1516)>>
Non-/sparse entries: 8460/356896
Sparsity           : 98%
Maximal term length: 16
Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
Sample             :
     Terms
Docs  chicago    houston ill ind mass     member philadelphia staff       tex
  10        0 0.00000000   0   0    0 0.00000000   0.00000000     0 0.0000000
  107       0 0.00000000   0   0    0 0.00000000   0.00000000     0 0.1424504
  112       0 0.00000000   0   0    0 0.01143180   0.00000000     0 0.1112894
  114       0 0.00000000   0   0    0 0.00000000   0.08506281     0 0.0000000
  121       0 0.00000000   0   0    0 0.00000000   0.00000000     0 0.0000000
  159       0 0.00000000   0   0    0 0.01898111   0.00000000     0 0.0000000
  233       0 0.06905695   0   0    0 0.00000000   0.00000000     0 0.0000000
  36        0 0.07041100   0   0    0 0.00000000   0.00000000     0 0.0000000
  69        0 0.00000000   0   0    0 0.00000000   0.00000000     0 0.0000000
  92        0 0.00000000   0   0    0 0.01862961   0.00000000     0 0.0000000
     Terms
Docs  unsuccessful
  10     0.0000000
  107    0.0000000
  112    0.0000000
  114    0.0000000
  121    0.0000000
  159    0.0000000
  233    0.0309444
  36     0.0000000
  69     0.0000000
  92     0.0000000
# Save the tf-idf weighted document-term matrix to disk.
saveRDS(profile_dtm, file = "profile_dtm.rds")
# Not relevant to the assignment: a manual tf-idf computation.
# Not supposed to be shown, but here it is anyway.
library(dplyr)
# Term Frequency
# For every profile, count how often each token occurs and divide by the total
# number of tokens in that profile. Result: one row per (ID, word) pair with
# columns count, word, total, ID and tf.
profileTF <- profile %>%
  split(., .$ID) %>%
  lapply(., function(x) {
    tokens <- tm::MC_tokenizer(x$Text)

    # Nothing to count for an empty document; rbind() below skips NULLs.
    if (length(tokens) == 0L) {
      return(NULL)
    }

    # table() counts every distinct token. The earlier
    # summary(as.factor(), maxsum = 1000) would lump all tokens beyond the
    # 1000 most frequent levels into a single "(Other)" row.
    token_count <- table(tokens)

    data.frame(count = as.integer(token_count),
               word = names(token_count),
               total = length(tokens),
               ID = x$ID,
               row.names = NULL)
  })

profileTF <- do.call("rbind", profileTF)
profileTF$tf <- profileTF$count / profileTF$total

# Show most frequent
head(profileTF[order(profileTF$tf, decreasing = TRUE), ])

# Inverse Document Frequency
# idf(word) = log(N / n_word), where N is the number of profiles and n_word is
# the number of profiles containing the word. profileTF holds one row per
# (ID, word) pair, so counting rows per word yields n_word.
# (Grouping by ID instead, as before, counts distinct words per profile and
# makes idf a per-document constant rather than a per-term weight.)
n_profiles <- length(unique(profileTF$ID))

profile_idf <- profileTF %>%
  dplyr::count(word) %>%
  dplyr::mutate(idf = log(n_profiles / n))

# tf-idf
# Attach each word's idf to every (ID, word) row and weight the term frequency.
profile_tfidf <- merge(profileTF, profile_idf, by = "word")
profile_tfidf$tfIDF <- profile_tfidf$tf * profile_tfidf$idf

saveRDS(profile_tfidf, file = "profile_tfidf.rds")
head(profile_tfidf[order(profile_tfidf$tfIDF, decreasing = TRUE), ], 20)

Fun Exploration

# Cross-tabulate the party and state recorded for each member's latest
# position.
donkey_or_elephant <- setNames(
  congress_df[c("Latest_Party", "Latest_State")],
  c("party", "state")
)
table(donkey_or_elephant)
             state
party         ID IL IN MA ME NM NY PA TX
  Democrat     0 21  4 14  2  9 32 15 18
  Independent  0  0  0  0  1  0  0  0  0
  Republican   5 11 16  0  2  2 22 20 47

Task 3

We are going to return to the Population Centers page: https://www.census.gov/geographies/reference-files/time-series/geo/centers-population.html

Now, your task is to get every link for “Centers of Population by County” and plot each county’s population center, just for 2020. You cannot use the link for the entire US. The task isn’t much different than you did in homework 1, but pay attention to the format of those links.

library(rvest)
library(mapview)

# Download the Census "Centers of Population" landing page.
census_link <- read_html('https://www.census.gov/geographies/reference-files/time-series/geo/centers-population.html')

# Collect every anchor inside the page's <noscript> markup.
county_link <- census_link %>%
  html_nodes("noscript") %>%
  html_elements("a[href]")

# Keep only the per-state county files for 2020. The filter looks at the href
# itself (not the whole anchor markup) and requires "CO" followed by a
# two-digit state code, which leaves out the single nationwide county file
# and any other year.
# NOTE(review): assumes 2020 hrefs contain "2020" and end in a name such as
# "..._CO01.txt" - confirm against the live page.
county_href <- html_attr(county_link, "href")
is_state_county_file <- grepl("2020", county_href, fixed = TRUE) &
  grepl("CO[0-9]{2}", county_href)
county_link <- county_link[is_state_county_file]
county_href <- county_href[is_state_county_file]

if (length(county_href) == 0L) {
  stop("No 2020 per-state county links found on the Census page.",
       call. = FALSE)
}

# Read each state's file and stack the columns needed for plotting.
county_data <- purrr::map_df(county_href, function(one_county_link) {
  # The hrefs are protocol-relative ("//..."), hence the "https:" prefix.
  # FIPS codes are read as character so leading zeros ("01") are preserved.
  county_read <- read.csv(
    paste0('https:', one_county_link),
    colClasses = c(STATEFP = "character", COUNTYFP = "character")
  )

  data.frame(STATEFP = county_read$STATEFP,
             COUNTYFP = county_read$COUNTYFP,
             COUNAME = county_read$COUNAME,
             STNAME = county_read$STNAME,
             POPULATION = county_read$POPULATION,
             LATITUDE = county_read$LATITUDE,
             LONGITUDE = county_read$LONGITUDE)
})

# Interactive map of every county's centre of population: an empty mapview
# layer plus the county points, with POPULATION passed as the zcol attribute.
plot <- local({
  base_layer <- mapview()
  county_layer <- mapview(
    county_data,
    xcol = "LONGITUDE",
    ycol = "LATITUDE",
    zcol = "POPULATION",
    cex = 2,
    grid = FALSE,
    alpha = 1
  )
  base_layer + county_layer
})
plot

Plotly

library(plotly)
library(dplyr)

# Styling
# Layout options for the plotly geo axis: a US-only Albers projection with
# light-gray land, slightly darker state/country borders and white lakes.
geo_style <- local({
  land_fill <- toRGB("gray95")
  border_line <- toRGB("gray85")
  lake_fill <- toRGB("white")
  border_width <- 0.5

  list(
    scope = 'usa',
    projection = list(type = 'albers usa'),
    showland = TRUE,
    landcolor = land_fill,
    subunitcolor = border_line,
    countrycolor = border_line,
    showlakes = TRUE,
    lakecolor = lake_fill,
    countrywidth = border_width,
    subunitwidth = border_width
  )
})

# Viz
# One marker per county centre of population, coloured by population, with a
# separate animation frame for each state (frame = ~STNAME).
p <- plot_geo(county_data, lat = ~LATITUDE, lon = ~LONGITUDE,
              frame = ~STNAME) %>%
  add_markers(
    # paste0() keeps the hover label tight ("County, State"); paste() with its
    # default sep = " " produced "County ,  State".
    text = ~paste0(COUNAME, ", ", STNAME, "<br />", "Population: ", POPULATION),
    color = ~POPULATION, symbol = I("circle"), size = I(8),
    hoverinfo = "text") %>%
  colorbar(title = "Population") %>%
  layout(title = "Centers of Population by County in 2020", geo = geo_style) %>%
  config(displayModeBar = FALSE)

p